library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(viridis)
## Loading required package: viridisLite
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(hms)
library(httr)
##
## Attaching package: 'httr'
## The following object is masked from 'package:plotly':
##
## config
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Document-wide knitr chunk defaults: echo code, suppress warnings, and use a
# fixed figure width / aspect ratio / output width.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  fig.width = 6,
  fig.asp = .6,
  out.width = "90%"
)

# Default ggplot2 theme: theme_minimal() with the legend placed at the bottom.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Ask ggplot2 to use viridis for continuous colour and fill scales.
# The fill option was previously spelled "virids", which is not a value
# ggplot2 recognises for this option.
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

# Define global scale_colour_discrete / scale_fill_discrete as the viridis
# discrete scales.
scale_colour_discrete <- scale_color_viridis_d
scale_fill_discrete <- scale_fill_viridis_d
# Years whose result files are read with the first (shorter) column
# specification, and years read with the second (longer) one.
years_1 <- c(1900:2012, 2014)
years_2 <- 2015:2019

# Read one results CSV, choosing the column types by the year found in `x`.
#
# x: path of a single CSV file; it is matched against the years above.
#
# Returns the tibble produced by read_csv(), or invisible NULL when the path
# contains none of the years in `years_1` or `years_2`.
# "NULL", "" and "0" are all treated as missing values.
importing_data <- function(x) {
  early_pattern <- str_c(years_1, collapse = "|")
  late_pattern <- str_c(years_2, collapse = "|")

  if (str_detect(x, early_pattern)) {
    return(
      read_csv(x, na = c("NULL", "", "0"), col_types = "cicccciiiicc")
    )
  }

  if (str_detect(x, late_pattern)) {
    return(
      read_csv(
        x,
        na = c("NULL", "", "0"),
        col_types = "cccicccccccccccccccccciiiiccc"
      )
    )
  }

  # No known year in the path: same silent NULL the if/else-if chain yields.
  invisible(NULL)
}
# Read every file under "data" with importing_data() and stack the results
# into one table.
boston_df <-
  tibble(file_name = list.files("data", full.names = TRUE)) %>%
  mutate(data = map(file_name, importing_data)) %>%
  unnest(data) %>%
  mutate(
    # The year is taken from the number embedded in the file path.
    year = readr::parse_number(file_name),
    # Fall back to `residence` where `city` is missing.
    city = coalesce(city, residence),
    # Replace every non-alphanumeric character in the name with a space.
    display_name = str_replace_all(display_name, "[^a-zA-Z0-9]", " ")
  ) %>%
  filter(!is.na(display_name)) %>%
  select(-c(file_name, residence))
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:hms':
##
## hms
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Plot-ready copy of boston_df: year as a factor, the two time columns as hms
# values, and the overall place as a number.
winners_df <-
  boston_df %>%
  mutate(
    place_overall = as.numeric(place_overall),
    pace = as_hms(pace),
    official_time = as_hms(official_time),
    year = as.factor(year)
  )
Make winners over time plot
# Official time of the rows with overall == 1, one point per year, joined by a
# single path (group = 1 puts every year in the same group).
winners_df %>%
  filter(overall == 1) %>%
  arrange(year) %>%
  ggplot(aes(x = year, y = official_time, group = 1)) +
  geom_point() +
  geom_path() +
  # Label one break per decade, 1910 through 2020.
  scale_x_discrete(breaks = seq(1910, 2020, by = 10))
Errors in data: ~1:20:00 is the fastest time here, but 1:59 is the fastest – but not recorded – https://www.nytimes.com/2019/10/12/sports/eliud-kipchoge-marathon-record.html
Plotly
Try with seconds – link the official time - specify range on y-axis
# Interactive line chart of official time by year for rows with overall == 1.
winners_df %>%
  filter(overall == 1) %>%
  plot_ly(
    type = "scatter",
    mode = "lines",
    # Attach each runner's display name as the trace text.
    text = ~paste('Name: ', display_name)
  ) %>%
  add_trace(x = ~year, y = ~official_time) %>%
  layout(
    yaxis = list(
      # Order the y-axis categories by the supplied array of official times.
      categoryorder = "array",
      categoryarray = winners_df$official_time
    )
  )
layout(yaxis = list(categoryorder = "total ascending"))
Marathon records
# Download the marathon-records page.
records_html <-
  read_html("https://www.topendsports.com/sport/athletics/record-marathon.htm")

# Extract every <table> on the page as a data frame and give each one the same
# five column names. The result is a list of data frames.
# `fill = TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
record_marathon <-
  records_html %>%
  html_nodes("table") %>%
  html_table(fill = TRUE) %>%
  lapply(setNames, nm = c("time", "date", "athlete", "country", "marathon"))
# Turn the scraped records into one data frame with an hms `time` column and a
# numeric `year`; the month and day pieces of the date are discarded.
marathon <-
  record_marathon %>%
  as.data.frame() %>%
  mutate(time = as_hms(time)) %>%
  # Split `date` into three pieces; only the third (year) is kept below.
  separate(date, into = c("month", "day", "year")) %>%
  mutate(year = as.numeric(year)) %>%
  select(-c(month, day))
# Interactive line chart of record time by year, with the athlete as the text.
marathon %>%
  plot_ly(
    type = "scatter",
    mode = "lines",
    text = ~paste('Name: ', athlete)
  ) %>%
  add_trace(x = ~year, y = ~time) %>%
  layout(
    yaxis = list(
      categoryorder = "array",
      # NOTE(review): the category array comes from winners_df, not from
      # `marathon` -- confirm whether this is intentional or a copy-paste.
      categoryarray = winners_df$official_time
    )
  )
library(purrr)
library(lubridate)
library(lubridate); trial = tibble(date = c("1:20:02"))
trial %>% mutate( date = hms(date) )
age x year (intervals?) pace x year plot? Boston winner compared to record winner overall -